Importing datasets:

# Root folder of the OULAD csv exports; edit here to relocate the data.
# (Previously the full path was repeated in every read.csv call.)
data_dir <- "C:/College stuff/SY/DS/Course Project/Datasets"

assessments         <- read.csv(file.path(data_dir, "assessments.csv"))
courses             <- read.csv(file.path(data_dir, "courses.csv"))
studentAssessment   <- read.csv(file.path(data_dir, "studentAssessment.csv"))
studentInfo         <- read.csv(file.path(data_dir, "studentInfo.csv"))
studentRegistration <- read.csv(file.path(data_dir, "studentRegistration.csv"))
studentVle          <- read.csv(file.path(data_dir, "studentVle.csv"))
vle                 <- read.csv(file.path(data_dir, "vle.csv"))

Displaying datasets:

# Echo each imported table to the console (explicit print for clarity;
# identical output to bare auto-printing at top level)
print(assessments)
print(courses)
print(studentAssessment)
print(studentInfo)
print(studentRegistration)
print(studentVle)
print(vle)

Importing libraries:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Feature Engineering:

Assessments:

# Split the assessment catalogue into final-exam rows and everything else
is_exam <- assessments$assessment_type == "Exam"
exams  <- assessments[is_exam, ]   # rows of type exam
others <- assessments[!is_exam, ]  # rows of other assessments

# Non-exam assessment counts per module/presentation
amounts <- table(others$code_module, others$code_presentation)

amounts
##      
##       2013B 2013J 2014B 2014J
##   AAA     0     5     0     5
##   BBB    11    11    11     5
##   CCC     0     0     8     8
##   DDD    13     6     6     6
##   EEE     0     4     4     4
##   FFF    12    12    12    12
##   GGG     0     9     9     9
# Flatten the contingency table into long format: one row per
# (module, presentation) pair with its assessment count
amounts_df <- as.data.frame(amounts)
amounts_df
amounts_df <- setNames(amounts_df, c("code_module", "code_presentation", "count"))
head(amounts_df)
# here, we have the total number of assessments by module
# Determine whether a grade passes an assessment.
#
# Vectorized replacement for the original scalar if/else version:
# `>=` already returns NA for NA grades, TRUE at/above the pass mark,
# FALSE below it, and works elementwise on whole score columns
# (existing sapply(score, pass_fail) calls keep working unchanged).
#
# @param grade numeric score, or vector of scores; may contain NA
# @param pass_mark passing threshold; defaults to 40 as before
# @return logical of the same length as grade; NA where grade is NA
pass_fail <- function(grade, pass_mark = 40) {
  grade >= pass_mark
}

# Join non-exam assessment metadata (weight, module, presentation) onto
# each submitted score, then derive a pass flag and a weighted grade.
modifiedStudentAssessment <- merge(studentAssessment, others,
                                   by = "id_assessment", all.x = TRUE)
modifiedStudentAssessment
modifiedStudentAssessment <- modifiedStudentAssessment %>%
  mutate(pass = sapply(score, pass_fail))
modifiedStudentAssessment
modifiedStudentAssessment <- modifiedStudentAssessment %>%
  mutate(weighted_grade = score * weight / 100)
modifiedStudentAssessment
# Total weighted (non-exam) grade per student per module presentation
avg_grade <- modifiedStudentAssessment %>%
  group_by(id_student, code_module, code_presentation) %>%
  summarize(total_weighted_grade = sum(weighted_grade, na.rm = TRUE),
            .groups = "drop")
avg_grade
head(avg_grade)
# Keep only passed assessments (rows with NA pass are dropped too,
# since subset() treats an NA condition as FALSE)
pass_df <- subset(modifiedStudentAssessment, pass == TRUE)
pass_df
# Count passed assessments per student per module presentation
pass_count <- aggregate(pass ~ id_student + code_module + code_presentation,
                        data = pass_df, FUN = length)
pass_count
# Attach the total assessment count and derive the pass fraction
pass_rate <- merge(pass_count, amounts_df,
                   by = c("code_module", "code_presentation"), all.x = TRUE)
pass_rate
pass_rate$pass_rate <- with(pass_rate, pass / count)
pass_rate
keep_cols <- c("id_student", "code_module", "code_presentation", "pass_rate")
pass_rate <- pass_rate[, keep_cols]

head(pass_rate)
pass_rate
# Final exam scores
# Inner-join student submissions with exam assessment rows: only records
# whose id_assessment is an exam survive (all.x = FALSE). Overlapping
# column names from the two source tables get .x/.y suffixes from merge().
stud_exams <- transform(merge(modifiedStudentAssessment, exams, by = "id_assessment", all.x = FALSE), exam_score = score)

stud_exams
# NOTE(review): positional column drops — fragile. Presumably indices
# 1, 3, 4, 6:12 and 15:20 remove the merge key, submission metadata and
# the suffixed duplicates, keeping id_student, score and the exam-side
# code_module/code_presentation; verify against the printed stud_exams
# above before changing any upstream merge or column order.
columns_to_drop_indices <- c(1, 3, 4, 6:12, 15:20)
stud_exams <- stud_exams[, -columns_to_drop_indices]

stud_exams
# Re-derive exam_score from the surviving score column, then drop score
# (column 2 at this point)
stud_exams$exam_score = stud_exams$score
stud_exams <- stud_exams[, -c(2)]

head(stud_exams)
stud_exams

VLE

vle
# Keep only VLE materials with a known active week range.
# NOTE(review): vle_filtered is never used later in this script.
vle_filtered <- subset(vle, !is.na(week_from))
vle_filtered
studentVle
# Mean date and clicks per student per site...
# (summarize_all() is superseded; across() is the current dplyr idiom
# and computes exactly the same column-wise means here.)
avg_per_site <- studentVle %>%
  group_by(id_student, id_site, code_module, code_presentation) %>%
  summarize(across(everything(), mean), .groups = "drop")

avg_per_site
# ...then averaged across sites to one row per student per presentation
avg_per_student <- avg_per_site %>%
  group_by(id_student, code_module, code_presentation) %>%
  summarize(date = mean(date), sum_click = mean(sum_click), .groups = "drop")

avg_per_student

StudentInfo

# Drop withdrawn students and keep only the columns needed downstream
studInfo <- subset(studentInfo, final_result != "Withdrawn",
                   select = c(code_module, code_presentation, id_student,
                              num_of_prev_attempts, final_result))

studInfo

Compiling all relevant tables

# Assemble the modeling table: weighted grades + pass rate + exam score
# + VLE activity + student info, keyed on student/module/presentation.
join_keys <- c("id_student", "code_module", "code_presentation")

df_1 <- inner_join(avg_grade, pass_rate, by = join_keys)

names(stud_exams) <- c("id_student", "code_module", "code_presentation", "exam_score")

assessment_info <- inner_join(df_1, stud_exams, by = join_keys)

assessment_info
df_2 <- inner_join(studInfo, assessment_info, by = join_keys)

final_df <- inner_join(df_2, avg_per_student, by = join_keys)

final_df
# The join keys have served their purpose; keep model features + label
final_df <- final_df[, !(names(final_df) %in% join_keys)]

final_df

EDA (Exploratory Data Analysis)

library(psych)

# Summary statistics (mean, sd, skew, kurtosis, ...) via psych::describe
describe(final_df)
# Base-R and dplyr views of the table structure
str(final_df)
## 'data.frame':    4950 obs. of  7 variables:
##  $ num_of_prev_attempts: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ final_result        : chr  "Distinction" "Pass" "Pass" "Pass" ...
##  $ total_weighted_grade: num  89.6 84.6 51.4 75.1 93.2 ...
##  $ pass_rate           : num  1 1 0.625 1 1 1 0.5 1 1 0.875 ...
##  $ exam_score          : int  94 76 66 50 98 100 68 84 90 66 ...
##  $ date                : num  103.5 87.6 49.2 118.7 75.8 ...
##  $ sum_click           : num  2.71 1.54 1.56 2.19 2.1 ...
glimpse(final_df)
## Rows: 4,950
## Columns: 7
## $ num_of_prev_attempts <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ final_result         <chr> "Distinction", "Pass", "Pass", "Pass", "Distincti…
## $ total_weighted_grade <dbl> 89.65, 84.58, 51.44, 75.13, 93.22, 91.41, 18.23, …
## $ pass_rate            <dbl> 1.000, 1.000, 0.625, 1.000, 1.000, 1.000, 0.500, …
## $ exam_score           <int> 94, 76, 66, 50, 98, 100, 68, 84, 90, 66, 100, 82,…
## $ date                 <dbl> 103.45791, 87.61726, 49.18129, 118.69864, 75.7919…
## $ sum_click            <dbl> 2.706754, 1.539047, 1.562619, 2.189217, 2.100617,…
# Numeric-only view of final_df for correlation analysis.
# Logical name matching replaces the original -which(...) pattern:
# with -which(), a missing column name yields integer(0) and silently
# selects ZERO columns instead of all of them.
quant_final_df <- final_df[, names(final_df) != "final_result"]
quant_final_df
# Candidate diverging palettes for corrplot.
# NOTE(review): only col1 is actually used below; col2-col4 and wb are unused.
col1 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "white", "cyan", 
    "#007FFF", "blue", "#00007F"))
col2 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7", 
    "#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"))
col3 <- colorRampPalette(c("red", "white", "blue"))
col4 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "#7FFF7F", 
    "cyan", "#007FFF", "blue", "#00007F"))
wb <- c("white", "black")
library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations of the numeric features
cor_matrix <- cor(quant_final_df)
cor_matrix
##                      num_of_prev_attempts total_weighted_grade   pass_rate
## num_of_prev_attempts           1.00000000          -0.06306455 -0.03817519
## total_weighted_grade          -0.06306455           1.00000000  0.87427114
## pass_rate                     -0.03817519           0.87427114  1.00000000
## exam_score                    -0.10428765           0.44905358  0.27791733
## date                           0.11032714           0.02119136  0.02755571
## sum_click                     -0.07787152          -0.17123406 -0.19486350
##                         exam_score       date     sum_click
## num_of_prev_attempts -0.1042876471 0.11032714 -0.0778715206
## total_weighted_grade  0.4490535767 0.02119136 -0.1712340590
## pass_rate             0.2779173251 0.02755571 -0.1948635039
## exam_score            1.0000000000 0.09634348 -0.0001294788
## date                  0.0963434802 1.00000000  0.2148798489
## sum_click            -0.0001294788 0.21487985  1.0000000000
summary(cor_matrix)
##  num_of_prev_attempts total_weighted_grade   pass_rate       
##  Min.   :-0.10429     Min.   :-0.1712      Min.   :-0.19486  
##  1st Qu.:-0.07417     1st Qu.:-0.0420      1st Qu.:-0.02174  
##  Median :-0.05062     Median : 0.2351      Median : 0.15274  
##  Mean   : 0.13782     Mean   : 0.3517      Mean   : 0.32445  
##  3rd Qu.: 0.07320     3rd Qu.: 0.7680      3rd Qu.: 0.72518  
##  Max.   : 1.00000     Max.   : 1.0000      Max.   : 1.00000  
##    exam_score            date           sum_click      
##  Min.   :-0.10429   Min.   :0.02119   Min.   :-0.1949  
##  1st Qu.: 0.02399   1st Qu.:0.04475   1st Qu.:-0.1479  
##  Median : 0.18713   Median :0.10334   Median :-0.0390  
##  Mean   : 0.28648   Mean   :0.24505   Mean   : 0.1285  
##  3rd Qu.: 0.40627   3rd Qu.:0.18874   3rd Qu.: 0.1611  
##  Max.   : 1.00000   Max.   :1.00000   Max.   : 1.0000
# Shrink plot margins so the full annotated matrix fits the device
par(mar = c(2,2,2,2))

corrplot(cor_matrix, method = "circle", type = "full", addrect = 2, col = col1(100), tl.col = "black", tl.srt = 45, addCoef.col = "black", cl.ratio = 0.4, cl.align = "r")

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Class balance of the outcome. "Withdrawn" was filtered out when
# building studInfo, so its gray fill should never actually appear.
ggplot(final_df, aes(x = final_result, fill = final_result)) + geom_bar() + labs(title = "Count Plot of Final Results", x = "Final Result", y = "Count") + theme_minimal() + scale_fill_manual(values = c("Pass" = "green", "Fail" = "red", "Distinction" = "blue", "Withdrawn" = "gray"))

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Scatterplot matrix (pairwise plots + correlations) of numeric features
ggpairs(quant_final_df)

# Inspect outliers: very high average clicks or many previous attempts
subset(final_df, sum_click > 10)
subset(final_df, num_of_prev_attempts > 4)
# BUG FIX: the second filter previously re-subset final_df, silently
# discarding the sum_click trim from the line before. Filter
# cumulatively so filtered_df satisfies BOTH conditions.
filtered_df <- subset(final_df, sum_click <= 10)
filtered_df <- subset(filtered_df, num_of_prev_attempts <= 4)
filtered_df

Modeling

# Load the required library
library(caret)
## Loading required package: lattice
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Split the data into training and testing sets (70/30, stratified on
# the outcome by createDataPartition).
# NOTE(review): this partitions final_df, not the outlier-trimmed
# filtered_df built above — confirm which table was intended.
set.seed(123)  # reproducible partition
splitIndex <- createDataPartition(final_df$final_result, p = 0.7, list = FALSE)
predictor_cols <- setdiff(names(final_df), "final_result")
X_train <- final_df[splitIndex, predictor_cols]
y_train <- final_df$final_result[splitIndex]
X_test <- final_df[-splitIndex, predictor_cols]
y_test <- final_df$final_result[-splitIndex]

# Min-max ("range") scalers for three feature sets:
#   scaler1: all predictors
#   scaler2: all predictors except total_weighted_grade
#   scaler3: all predictors except pass_rate
# BUG FIX: the exclusion previously used "weighted_grade", which matches
# no column (the column is "total_weighted_grade"), so scaler2/X2 were
# identical to scaler1/X1 — visible in the identical LDA confusion
# matrices for datasets 1 and 2 below.
drop_for_2 <- "total_weighted_grade"
drop_for_3 <- "pass_rate"
scaler1 <- preProcess(X_train, method = c("range"))
scaler2 <- preProcess(X_train[, !names(X_train) %in% drop_for_2], method = c("range"))
scaler3 <- preProcess(X_train[, !names(X_train) %in% drop_for_3], method = c("range"))

# Apply each scaler to its matching train/test feature subset
X1_train <- predict(scaler1, X_train)
X1_test <- predict(scaler1, X_test)
X2_train <- predict(scaler2, X_train[, !names(X_train) %in% drop_for_2])
X2_test <- predict(scaler2, X_test[, !names(X_test) %in% drop_for_2])
X3_train <- predict(scaler3, X_train[, !names(X_train) %in% drop_for_3])
X3_test <- predict(scaler3, X_test[, !names(X_test) %in% drop_for_3])

Linear Discriminant Analysis

# Load the required library
library(MASS)  # This package contains Linear Discriminant Analysis
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# NOTE(review): from here on bare select() resolves to MASS::select,
# not dplyr::select (benign here, since select is not called again).
# Fit LDA on the training data
lda1 <- lda(y_train ~ ., data = data.frame(X1_train, y_train))

# Transform the test data
# NOTE(review): X1_test_lda duplicates the result_lda1 call below and
# is never used afterwards.
X1_test_lda <- predict(lda1, newdata = data.frame(X1_test))

# Predict on the transformed test data
result_lda1 <- predict(lda1, newdata = data.frame(X1_test))

# Calculate and print confusion matrix
confusion_matrix <- table(Actual = y_test, Predicted = result_lda1$class)
print(confusion_matrix)
##              Predicted
## Actual        Distinction Fail Pass
##   Distinction         231    0   33
##   Fail                  0  151   49
##   Pass                 49   48  923
cat("\n")
# Calculate and print additional metrics
# Accuracy is overall; precision/recall/F1 use row/column 2 of the
# matrix, i.e. they describe the "Fail" class only (not macro-averaged).
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.8793801
cat("Precision: ", precision, "\n")
## Precision:  0.758794
cat("Recall: ", recall, "\n")
## Recall:  0.755
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7568922
# Fit LDA on the training data for dataset 2
# NOTE(review): the recorded results below are byte-identical to
# dataset 1's — consistent with X2 being the same as X1 due to the
# "weighted_grade" vs "total_weighted_grade" name mismatch in the
# scaler setup above.
lda2 <- lda(y_train ~ ., data = data.frame(X2_train, y_train))

# Transform the test data for dataset 2
# NOTE(review): X2_test_lda duplicates result_lda2 and is unused.
X2_test_lda <- predict(lda2, newdata = data.frame(X2_test))

# Predict on the transformed test data for dataset 2
result_lda2 <- predict(lda2, newdata = data.frame(X2_test))

# Calculate and print confusion matrix for dataset 2
confusion_matrix <- table(Actual = y_test, Predicted = result_lda2$class)
print(confusion_matrix)
##              Predicted
## Actual        Distinction Fail Pass
##   Distinction         231    0   33
##   Fail                  0  151   49
##   Pass                 49   48  923
cat("\n")
# Calculate and print additional metrics for dataset 2
# (precision/recall/F1 are for the "Fail" class — row/column 2)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.8793801
cat("Precision: ", precision, "\n")
## Precision:  0.758794
cat("Recall: ", recall, "\n")
## Recall:  0.755
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7568922
# Fit LDA on the training data for dataset 3 (pass_rate excluded)
lda3 <- lda(y_train ~ ., data = data.frame(X3_train, y_train))

# Transform the test data for dataset 3
# NOTE(review): X3_test_lda duplicates result_lda3 and is unused.
X3_test_lda <- predict(lda3, newdata = data.frame(X3_test))

# Predict on the transformed test data for dataset 3
result_lda3 <- predict(lda3, newdata = data.frame(X3_test))

# Calculate and print confusion matrix for dataset 3
confusion_matrix <- table(Actual = y_test, Predicted = result_lda3$class)
print(confusion_matrix)
##              Predicted
## Actual        Distinction Fail Pass
##   Distinction         234    0   30
##   Fail                  0  160   40
##   Pass                 52   45  923
cat("\n")
# Calculate and print additional metrics for dataset 3
# (precision/recall/F1 are for the "Fail" class — row/column 2)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.8874663
cat("Precision: ", precision, "\n")
## Precision:  0.7804878
cat("Recall: ", recall, "\n")
## Recall:  0.8
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7901235